import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Jupyter magic: render matplotlib figures inline in the notebook.
%matplotlib inline
import seaborn as sns
import plotly.offline as ply
import plotly.graph_objects as go
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
import statsmodels.api as sm
from sklearn import metrics
import warnings
from IPython.display import Image
from sklearn.linear_model import LinearRegression
# NOTE(review): several imports below duplicate earlier lines
# (metrics, LogisticRegression, KNeighborsClassifier) -- harmless but noisy.
from sklearn import metrics
from itertools import combinations
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, log_loss
from sklearn.metrics import confusion_matrix, classification_report
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
# Silence library warnings to keep the notebook output readable.
warnings.filterwarnings('ignore')
Lending institutions all over the world need to classify whether a loan is acceptable (i.e., good) or not (i.e., default) to make their lending decisions. They can do this by relating the outcomes of those loans which were given (1= default, 0= good) to features such as home ownership, annual income, and the debt to income ratio of the credit applicant.
# Display the Lending Club logo (expects Lending-Club.jpg next to the notebook).
Image(filename='Lending-Club.jpg')
Goal: gain insights into the parameters that affect default on loans given to credit applicants by LendingClub, and build a model that predicts whether a loan will default.
Or maybe we don't have enough information to classify correctly whether a loan will default, and it depends on additional features, such as the lender's character?
Lending Club is a peer-to-peer lender that allows investors to lend money to borrowers without an intermediary being involved.
# Read the raw Lending Club loan data and take a first look.
df = pd.read_csv('Lending_Club.csv')
df.columns
df.describe().T
df.info()
# Normalize column names to stripped upper-case so later access is uniform.
dfColumns = [col.strip().upper() for col in df.columns]
df.columns = dfColumns
print("Lending Club DF with NA values:")
print(df.columns[df.isna().any()].tolist())
df.columns
print("Counting NA values per recognized columns with NA:")
# Fixed typo in the printed label ("Valus" -> "Values").
print("EMPLOYMENT_LENGTH NA Values:" + str(df.EMPLOYMENT_LENGTH.isna().sum()))
df.info()
# Fill records that contain NA values by carrying the previous row forward,
# then drop identifier columns that carry no predictive signal.
df_loan = df.copy()
# .ffill() replaces fillna(method='ffill'), which is deprecated in pandas >= 2.1;
# the result is identical.
df_loan.ffill(inplace=True)
# In addition, remove unnecessary identifier columns.
df_loan = df_loan.drop(["LOAN_ID", "BORROWER_ID"], axis=1)
print("Current columns with NA values")
print(df_loan.columns[df_loan.isna().any()].tolist())
# Remove any records that still contain NA (e.g. NA in the very first row,
# which forward fill cannot reach).
df_loan = df_loan.dropna()
df_loan.info()
# Highlight non-positive values in the summary table for a quick sanity check.
df_loan.describe().style.apply(lambda x: ["background: yellow" if v <= 0 else "" for v in x], axis = 1)
# Print the distinct values of every analysis column, then the value counts
# of the lower-cardinality ones.  Output is identical to the original
# column-by-column print statements.
for col in ['LOAN_AMOUNT', 'LOAN_TERM', 'INTEREST_RATE', 'MONTHLY_PAYMENT',
            'LOAN_PURPOSE', 'EMPLOYMENT_LENGTH', 'HOUSING', 'ANNUAL_INCOME',
            'DEBT_TO_INCOME', 'DEFAULT']:
    print(col + " unique values:" + str(df_loan[col].unique()))
for col in ['LOAN_AMOUNT', 'LOAN_TERM', 'LOAN_PURPOSE', 'EMPLOYMENT_LENGTH',
            'HOUSING', 'ANNUAL_INCOME', 'DEBT_TO_INCOME', 'DEFAULT']:
    print(col + " values count:")
    print(df_loan[col].value_counts())
# Bucket EMPLOYMENT_LENGTH into coarser bins.
# The original cell iterated positions 0..len-1 and wrote through chained
# indexing (df_loan['EMPLOYMENT_LENGTH'][i]); after dropna() the index is no
# longer contiguous, so those label lookups can raise KeyError, and chained
# assignment may silently write to a copy.  A vectorized map is both correct
# and far faster.  Any value not listed below (e.g. "10+ years") falls into
# ">10 Years", matching the original else branch.
_employment_length_bins = {
    "< 1 year": "< 1 year",
    "1 year": "1-2 Years", "2 years": "1-2 Years",
    "3 years": "3-4 Years", "4 years": "3-4 Years",
    "5 years": "5-6 Years", "6 years": "5-6 Years",
    "7 years": "7-8 Years", "8 years": "7-8 Years",
    "9 years": "9-10 Years", "10 years": "9-10 Years",
}
df_loan['EMPLOYMENT_LENGTH'] = (
    df_loan['EMPLOYMENT_LENGTH'].map(_employment_length_bins).fillna(">10 Years")
)
print("EMPLOYMENT_LENGTH :", df_loan['EMPLOYMENT_LENGTH'].unique())
# Summary statistics of the cleaned frame (transposed for readability).
df_loan.describe().T
# Per-outcome feature means (the notebook displays the last expression).
left = df_loan.groupby('DEFAULT')
left.mean()
# Class balance of the target variable, as a bar chart.
sns.countplot(x='DEFAULT', data=df_loan)
plt.xlabel('Loan Outcome')
plt.ylabel("Number of Loans")
plt.title("Loan Classification")
plt.show()
# Relative and absolute frequencies of the target classes.
pd.value_counts(df_loan['DEFAULT'].values,normalize=True)
df_loan['DEFAULT'].value_counts()
# Plotly bar chart of the target class counts.
# NOTE: value_counts().reset_index() used to yield columns ['index', 'DEFAULT'],
# but pandas >= 2.0 names them ['DEFAULT', 'count'], which silently breaks the
# df_plot['index'] lookup.  Reading the Series' index/values directly is
# version-proof and produces the same chart.
default_counts = df_loan['DEFAULT'].value_counts()
plot_data = [
    go.Bar(
        x=default_counts.index,
        y=default_counts.values,
        width=[0.5, 0.5],
        marker=dict(color=['blue', 'orange'])
    )
]
plot_layout = go.Layout(
    xaxis={"type": "category"},
    yaxis={"title": "DEFAULT"},
    title='DEFAULT',
    plot_bgcolor='rgb(243,243,243)',
    paper_bgcolor='rgb(243,243,243)',
)
fig = go.Figure(data=plot_data, layout=plot_layout)
ply.iplot(fig)
There are 22442 no default loans and 2557 default loans in the outcome variables.
# Class balance of the target, expressed as percentages.
count_no_default = (df_loan['DEFAULT'] == 0).sum()
count_default = (df_loan['DEFAULT'] == 1).sum()
total = count_no_default + count_default
pct_of_no_default = count_no_default / total
print("percentage of no default is", pct_of_no_default*100)
pct_of_default = count_default / total
print("percentage of default", pct_of_default*100)
%matplotlib inline
# Default frequency split by home ownership.
pd.crosstab(df_loan.HOUSING,df_loan.DEFAULT).plot(kind='bar')
plt.title('Default Frequency for Home Ownership')
plt.xlabel('Home Ownership')
plt.ylabel('Number of Loans')
plt.savefig('default_fre_home')
# Proportion of default within each employment-length bucket, stacked to 100%.
table=pd.crosstab(df_loan['EMPLOYMENT_LENGTH'],df_loan['DEFAULT'])
table.div(table.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True)
plt.title('Stacked Bar Chart of EMPLOYMENT_LENGTH vs Default')
plt.xlabel('EMPLOYMENT_LENGTH')
plt.ylabel('Proportion of Customers')
# NOTE(review): this filename looks copy-pasted from another notebook
# ("marital vs purpose"); consider renaming to match this chart.
plt.savefig('mariral_vs_pur_stack')
%matplotlib inline
import matplotlib
matplotlib.rcParams['figure.figsize'] = (10.0, 10.0)
import matplotlib.pyplot as plt
import seaborn as sns
sns.kdeplot(df_loan['ANNUAL_INCOME'].loc[df_loan['DEFAULT'] == 0], label='not default', shade=True);
sns.kdeplot(df_loan['ANNUAL_INCOME'].loc[df_loan['DEFAULT'] == 1], label='not default', shade=True);
# Horizontal bar chart: number of loans per employment-length bucket
# (the LOAN_AMOUNT column just supplies the per-group row count).
num_projects = df_loan.groupby('EMPLOYMENT_LENGTH').count()
bar_colors = ['blue', 'orange', 'green', 'red', 'purple', 'chocolate', 'pink', 'grey']
plt.barh(num_projects.index.values, num_projects['LOAN_AMOUNT'], color=bar_colors)
plt.title('Loans Frequency for Employment Length')
plt.xlabel('Number of Loans')
plt.ylabel("borrower's Employment Length")
plt.show()
# Horizontal bar chart: number of loans per loan purpose.
num_projects=df_loan.groupby('LOAN_PURPOSE').count()
plt.barh(num_projects.index.values, num_projects['LOAN_AMOUNT'], color=['blue', 'orange','green','red','purple','chocolate','pink','grey'])
plt.xlabel('Number of Loans')
# Fixed axis label: this chart groups by loan purpose, not employment length
# (the label was copy-pasted from the previous cell).
plt.ylabel("Loan Purpose")
plt.title('Loans Frequency for Loan purpose')
plt.show()
%matplotlib inline
import matplotlib.pyplot as plt
# Histograms of every numeric column for a quick distribution overview.
df_loan.hist(bins=10, figsize=(20,15))
plt.savefig("attribute_histogram_plots")
plt.show()
# Count plots for each categorical feature: first raw counts, then the same
# charts split by the DEFAULT outcome.
features2 = ['LOAN_PURPOSE', 'EMPLOYMENT_LENGTH', 'HOUSING', 'DEFAULT']
fig = plt.subplots(figsize=(30, 30))
for plot_idx, feature in enumerate(features2, start=1):
    plt.subplot(4, 2, plot_idx)
    plt.subplots_adjust(hspace=1.0)
    sns.countplot(x=feature, data=df_loan)
    plt.xticks(rotation=90)
    plt.title("Number of Loans")
fig = plt.subplots(figsize=(30, 30))
for plot_idx, feature in enumerate(features2, start=1):
    plt.subplot(4, 2, plot_idx)
    plt.subplots_adjust(hspace=1.0)
    sns.countplot(x=feature, data=df_loan, hue='DEFAULT')
    plt.xticks(rotation=90)
    plt.title("Number of Loans")
# Mean default rate for each distinct loan amount (plotly scatter),
# followed by the loan-amount histogram.
df_plot = df_loan.groupby('LOAN_AMOUNT').DEFAULT.mean().reset_index()
marker_style = dict(size=7, line=dict(width=1), color='blue', opacity=0.8)
plot_data = [
    go.Scatter(
        x=df_plot['LOAN_AMOUNT'],
        y=df_plot['DEFAULT'],
        mode='markers',
        name='Low',
        marker=marker_style,
    )
]
plot_layout = go.Layout(
    title='LOAN_AMOUNT vs DEFAULT',
    xaxis={'title': "LOAN_AMOUNT"},
    yaxis={'title': "DEFAULT"},
    plot_bgcolor="rgb(243,243,243)",
    paper_bgcolor="rgb(243,243,243)",
)
fig = go.Figure(data=plot_data, layout=plot_layout)
ply.iplot(fig)
df_loan['LOAN_AMOUNT'].hist(bins=50, figsize=(20, 15), color=['green'])
plt.savefig("LOAN_AMOUNT_histogram_plot")
plt.show()
#mean DEFAULT by LOAN_TERM
# Select the DEFAULT column *before* aggregating: a frame-wide .mean() fails
# on the remaining string columns in recent pandas (numeric_only handling).
# The result for DEFAULT is identical.
ax = df_loan.groupby(['LOAN_TERM'])['DEFAULT'].mean().reindex([' 60 months', ' 36 months']).plot.bar(figsize=(12, 5), color=['red', 'purple'])
ax.set_title('DEFAULT mean LOAN_TERM')
ax.set_ylabel('DEFAULT mean')
# Mean default rate for each distinct interest rate (plotly scatter),
# followed by the interest-rate histogram.
df_plot = df_loan.groupby('INTEREST_RATE').DEFAULT.mean().reset_index()
marker_style = dict(size=7, line=dict(width=1), color='blue', opacity=0.8)
plot_data = [
    go.Scatter(
        x=df_plot['INTEREST_RATE'],
        y=df_plot['DEFAULT'],
        mode='markers',
        name='Low',
        marker=marker_style,
    )
]
plot_layout = go.Layout(
    title='INTEREST_RATE vs DEFAULT',
    xaxis={'title': "INTEREST_RATE"},
    yaxis={'title': "DEFAULT"},
    plot_bgcolor="rgb(243,243,243)",
    paper_bgcolor="rgb(243,243,243)",
)
fig = go.Figure(data=plot_data, layout=plot_layout)
ply.iplot(fig)
df_loan['INTEREST_RATE'].hist(bins=50, figsize=(20, 15), color=['chocolate'])
plt.savefig("INTEREST_RATE_histogram_plot")
plt.show()
# Mean default rate for each distinct monthly payment (plotly scatter),
# followed by the monthly-payment histogram.
df_plot = df_loan.groupby('MONTHLY_PAYMENT').DEFAULT.mean().reset_index()
marker_style = dict(size=7, line=dict(width=1), color='blue', opacity=0.8)
plot_data = [
    go.Scatter(
        x=df_plot['MONTHLY_PAYMENT'],
        y=df_plot['DEFAULT'],
        mode='markers',
        name='Low',
        marker=marker_style,
    )
]
plot_layout = go.Layout(
    title='MONTHLY_PAYMENT vs DEFAULT',
    xaxis={'title': "MONTHLY_PAYMENT"},
    yaxis={'title': "DEFAULT"},
    plot_bgcolor="rgb(243,243,243)",
    paper_bgcolor="rgb(243,243,243)",
)
fig = go.Figure(data=plot_data, layout=plot_layout)
ply.iplot(fig)
df_loan['MONTHLY_PAYMENT'].hist(bins=50, figsize=(20, 15), color=['pink'])
plt.savefig("MONTHLY_PAYMENT_histogram_plot")
plt.show()
#mean DEFAULT by LOAN_PURPOSE
# Aggregate only the DEFAULT column (a frame-wide .mean() fails on the string
# columns in recent pandas); the plotted values are unchanged.
purpose_order = ['debt_consolidation', 'credit_card', 'small_business', 'medical',
                 'other', 'vacation', 'house', 'major_purchase', 'home_improvement',
                 'wedding', 'car', 'moving', 'renewable_energy', 'educational']
ax = df_loan.groupby(['LOAN_PURPOSE'])['DEFAULT'].mean().reindex(purpose_order).plot.bar(
    figsize=(12, 5),
    color=['blue', 'orange','green','red','purple','chocolate','pink','grey','black','brown','yellow'])
ax.set_title('DEFAULT mean LOAN_PURPOSE')
ax.set_ylabel('DEFAULT mean')
#mean DEFAULT by EMPLOYMENT_LENGTH
# Aggregate only the DEFAULT column (a frame-wide .mean() fails on the string
# columns in recent pandas); the plotted values are unchanged.
employment_order = ['< 1 year', '1-2 Years', '3-4 Years', '5-6 Years',
                    '7-8 Years', '9-10 Years', '>10 Years']
ax = df_loan.groupby(['EMPLOYMENT_LENGTH'])['DEFAULT'].mean().reindex(employment_order).plot.bar(
    figsize=(12, 5),
    color=['blue', 'orange','green','red','purple','chocolate','pink','grey','black','brown','yellow'])
ax.set_title('DEFAULT mean EMPLOYMENT_LENGTH')
ax.set_ylabel('DEFAULT mean')
#mean DEFAULT by HOUSING
# Aggregate only the DEFAULT column (a frame-wide .mean() fails on the string
# columns in recent pandas); the plotted values are unchanged.
ax = df_loan.groupby(['HOUSING'])['DEFAULT'].mean().reindex(['yes', 'no']).plot.bar(figsize=(12, 5), color=['green', 'red'])
ax.set_title('DEFAULT mean HOUSING')
ax.set_ylabel('DEFAULT mean')
# Mean default rate for each distinct annual income (plotly scatter),
# followed by the annual-income histogram.
df_plot = df_loan.groupby('ANNUAL_INCOME').DEFAULT.mean().reset_index()
marker_style = dict(size=7, line=dict(width=1), color='blue', opacity=0.8)
plot_data = [
    go.Scatter(
        x=df_plot['ANNUAL_INCOME'],
        y=df_plot['DEFAULT'],
        mode='markers',
        name='Low',
        marker=marker_style,
    )
]
plot_layout = go.Layout(
    title='ANNUAL_INCOME vs DEFAULT',
    xaxis={'title': "ANNUAL_INCOME"},
    yaxis={'title': "DEFAULT"},
    plot_bgcolor="rgb(243,243,243)",
    paper_bgcolor="rgb(243,243,243)",
)
fig = go.Figure(data=plot_data, layout=plot_layout)
ply.iplot(fig)
df_loan['ANNUAL_INCOME'].hist(bins=50, figsize=(20, 15), color=['grey'])
plt.savefig("ANNUAL_INCOME_histogram_plot")
plt.show()
# Mean default rate for each distinct debt-to-income ratio (plotly scatter),
# followed by the debt-to-income histogram.
df_plot = df_loan.groupby('DEBT_TO_INCOME').DEFAULT.mean().reset_index()
marker_style = dict(size=7, line=dict(width=1), color='blue', opacity=0.8)
plot_data = [
    go.Scatter(
        x=df_plot['DEBT_TO_INCOME'],
        y=df_plot['DEFAULT'],
        mode='markers',
        name='Low',
        marker=marker_style,
    )
]
plot_layout = go.Layout(
    title='DEBT_TO_INCOME vs DEFAULT',
    xaxis={'title': "DEBT_TO_INCOME"},
    yaxis={'title': "DEFAULT"},
    plot_bgcolor="rgb(243,243,243)",
    paper_bgcolor="rgb(243,243,243)",
)
fig = go.Figure(data=plot_data, layout=plot_layout)
ply.iplot(fig)
df_loan['DEBT_TO_INCOME'].hist(bins=50, figsize=(20, 15), color=['yellow'])
plt.savefig("DEBT_TO_INCOME_histogram_plot")
plt.show()
#move down
# Monthly payment vs loan amount, colored by loan term.  Two fixes: seaborn
# renamed the `size` parameter to `height` (seaborn >= 0.9), and the title
# wrongly said "by EMPLOYMENT_LENGTH" while the hue is LOAN_TERM.
sns.pairplot(x_vars=['MONTHLY_PAYMENT'], y_vars=['LOAN_AMOUNT'], data=df_loan, hue="LOAN_TERM", height=5)
plt.title("MONTHLY_PAYMENT vs LOAN_AMOUNT by LOAN_TERM")
# Correlation heat map over the analysis columns.
# NOTE: string-typed columns in `features` are silently dropped by .corr()
# in older pandas -- verify behavior if the pandas version is upgraded.
features = ['LOAN_AMOUNT', 'LOAN_TERM', 'INTEREST_RATE', 'MONTHLY_PAYMENT',
            'LOAN_PURPOSE', 'EMPLOYMENT_LENGTH', 'HOUSING', 'ANNUAL_INCOME',
            'DEBT_TO_INCOME', 'DEFAULT']
sns.set_style()
corr = df_loan[features].corr()
sns.heatmap(corr, cmap="RdYlBu", vmin=-1, vmax=1)
plt.title("correlation heat map")
plt.show()
# Annotated correlation matrix with the redundant upper triangle masked out.
features = ['LOAN_AMOUNT', 'LOAN_TERM', 'INTEREST_RATE', 'MONTHLY_PAYMENT',
            'LOAN_PURPOSE', 'EMPLOYMENT_LENGTH', 'HOUSING', 'ANNUAL_INCOME',
            'DEBT_TO_INCOME', 'DEFAULT']
# np.bool was removed in NumPy 1.24 -- the builtin bool dtype is equivalent.
mask = np.zeros_like(df_loan[features].corr(), dtype=bool)
mask[np.triu_indices_from(mask)] = True
f, ax = plt.subplots(figsize=(16, 12))
plt.title('Correlation Matrix', fontsize=25)
sns.heatmap(df_loan[features].corr(), vmax=1.0, square=True, cmap="RdYlBu",
            linecolor='w', annot=True, mask=mask, cbar_kws={"shrink": .75})
Now let’s look at how much each independent variable correlates with the DEFAULT.
# Correlation of every feature with the target, strongest first.
# NOTE(review): frame-wide .corr() relies on older pandas silently dropping
# string columns; pandas >= 2.0 requires numeric_only=True here -- confirm
# against the pinned pandas version.
corr_matrix = df_loan.corr()
corr_matrix["DEFAULT"].sort_values(ascending=False)
# Pairwise scatter matrix of the attributes most correlated with DEFAULT.
attributes = ["DEFAULT", "INTEREST_RATE", "MONTHLY_PAYMENT",'LOAN_AMOUNT','DEBT_TO_INCOME']
scatter_matrix(df_loan[attributes], figsize=(12, 8))
plt.savefig('matrix.png')
Create dummy variables for four categorical variables.
# One-hot encode the four categorical variables, persist the encoded frame,
# then split features / target for modeling.
df_loan_dummies = pd.get_dummies(df_loan, columns=['LOAN_TERM', 'LOAN_PURPOSE', 'EMPLOYMENT_LENGTH', 'HOUSING'])
df_loan_dummies.head().T
df_loan_dummies.to_csv("df_loan_dummies.csv", index=False)
df_loan_dummies.columns
# Target as int, features = everything else.
y = df_loan_dummies["DEFAULT"].astype('int')
X = df_loan_dummies.drop("DEFAULT", axis=1)
# Stratified split keeps the 90:10 class ratio in both partitions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=47, stratify=y)
df_loan_dummies["DEFAULT"].value_counts()
sns.countplot(x="DEFAULT", data=df_loan_dummies)
plt.show()
# Class balance again, on the dummy-encoded frame ("\033[1m" prints bold).
count_no_default = (df_loan_dummies['DEFAULT'] == 0).sum()
count_default = (df_loan_dummies['DEFAULT'] == 1).sum()
total = count_no_default + count_default
pct_of_no_default = count_no_default / total
print("\033[1m percentage of no default is", pct_of_no_default*100)
pct_of_default = count_default / total
print("\033[1m percentage of default", pct_of_default*100)
Our classes are imbalanced, and the ratio of no-default to default instances is 90:10.
With our training data created, I'll up-sample the defaults using the SMOTE algorithm (Synthetic Minority Oversampling Technique). At a high level, SMOTE creates synthetic minority-class examples by interpolating between existing minority-class samples and their nearest neighbors.
We are going to implement SMOTE in Python.
# Oversample the minority class on the TRAINING split only, so the test set
# stays untouched.  Renamed the sampler from `os`, which shadowed the os module.
smote = SMOTE(random_state=47)
columns1 = X.columns
# fit_resample replaces fit_sample, which was removed in imbalanced-learn 0.8;
# both names produced the same result where they coexisted.
os_data_X, os_data_y = smote.fit_resample(X_train, y_train)
os_data_X = pd.DataFrame(data=os_data_X, columns=columns1)
os_data_y = pd.DataFrame(data=os_data_y, columns=['DEFAULT'])
# we can check the numbers of our data
print("length of oversampled data is ", len(os_data_X))
print("Number of no default in oversampled data", len(os_data_y[os_data_y['DEFAULT']==0]))
print("Number of default", len(os_data_y[os_data_y['DEFAULT']==1]))
print("Proportion of no default data in oversampled data is ", len(os_data_y[os_data_y['DEFAULT']==0])/len(os_data_X))
print("Proportion of default data in oversampled data is ", len(os_data_y[os_data_y['DEFAULT']==1])/len(os_data_X))
cols = columns1
# Replace the imbalanced training split with the balanced one.
X_train = os_data_X[cols]
y_train = os_data_y['DEFAULT']
X_train.columns
# Bar chart of the (now balanced) training-target distribution.
# NOTE(review): the tick labels are hard-coded while value_counts() orders
# bars by frequency -- verify the left bar really is the DEFAULT=True class.
ax=pd.value_counts(y_train.values,normalize=True).plot.bar(figsize=(12, 5),color=['blue','orange'])
ax.set_title('DEFAULT')
ax.set_xticklabels(['DEFAULT=True','NO DEFAULT=False'])
plt.show()
# For every dummy column, compare its normalized distribution before SMOTE
# (the raw dummy frame) and after SMOTE (the balanced training split),
# side by side.  This replaces twenty copy-pasted plotting cells with one
# loop and normalizes the titles ("before" titles inconsistently dropped the
# word "balancing" in all but the first copy).
dummy_cols = [
    'LOAN_TERM_ 36 months', 'LOAN_TERM_ 60 months', 'LOAN_PURPOSE_car',
    'LOAN_PURPOSE_credit_card', 'LOAN_PURPOSE_debt_consolidation',
    'LOAN_PURPOSE_home_improvement', 'LOAN_PURPOSE_major_purchase',
    'LOAN_PURPOSE_medical', 'LOAN_PURPOSE_moving', 'LOAN_PURPOSE_other',
    'LOAN_PURPOSE_vacation', 'EMPLOYMENT_LENGTH_1-2 Years',
    'EMPLOYMENT_LENGTH_3-4 Years', 'EMPLOYMENT_LENGTH_5-6 Years',
    'EMPLOYMENT_LENGTH_7-8 Years', 'EMPLOYMENT_LENGTH_9-10 Years',
    'EMPLOYMENT_LENGTH_< 1 year', 'EMPLOYMENT_LENGTH_>10 Years',
    'HOUSING_yes', 'HOUSING_no',
]
for col in dummy_cols:
    plt.figure(figsize=(25, 25))
    plt.subplot(1, 2, 1)
    plt.title("Data before balancing " + col)
    pd.value_counts(df_loan_dummies[col].values, normalize=True).plot.bar(figsize=(12, 5), color=['blue', 'darkorange'])
    plt.subplot(1, 2, 2)
    plt.title("Data after balancing " + col)
    pd.value_counts(X_train[col].values, normalize=True).plot.bar(figsize=(12, 5), color=['blue', 'darkorange'])
# Correlation heat map restricted to the dummy columns plus the target.
relevantColumns = [
    'LOAN_TERM_ 36 months', 'LOAN_TERM_ 60 months', 'LOAN_PURPOSE_car',
    'LOAN_PURPOSE_credit_card', 'LOAN_PURPOSE_debt_consolidation',
    'LOAN_PURPOSE_home_improvement', 'LOAN_PURPOSE_major_purchase',
    'LOAN_PURPOSE_medical', 'LOAN_PURPOSE_moving', 'LOAN_PURPOSE_other',
    'LOAN_PURPOSE_vacation', 'EMPLOYMENT_LENGTH_1-2 Years',
    'EMPLOYMENT_LENGTH_3-4 Years', 'EMPLOYMENT_LENGTH_5-6 Years',
    'EMPLOYMENT_LENGTH_7-8 Years', 'EMPLOYMENT_LENGTH_9-10 Years',
    'EMPLOYMENT_LENGTH_< 1 year', 'EMPLOYMENT_LENGTH_>10 Years',
    'HOUSING_no', 'HOUSING_yes', 'DEFAULT',
]
plt.figure(figsize=(10, 10))
sns.set_style()
corr = df_loan_dummies[relevantColumns].corr()
sns.heatmap(corr, cmap="RdYlBu", vmin=-1, vmax=1)
plt.title("correlation heat map")
# ---------- Model 1: K-Nearest Neighbors ----------
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=17)
# Fit on the SMOTE-balanced training split; score on the untouched test split.
knn.fit(X_train, y_train)
print("Accuracy on training set:", "{:.2%}".format(knn.score(X_train, y_train)))
print("Accuracy on test set:", "{:.2%}".format(knn.score(X_test, y_test)))
print('\n accuracy:')
print('---------------')
acc1=knn.score(X_test, y_test)
print(acc1)
y_pred1 = knn.predict(X_test)
from sklearn.metrics import confusion_matrix
cm1 = confusion_matrix(y_true=y_test, y_pred=y_pred1)
cmDf1=pd.DataFrame(cm1, index=knn.classes_, columns=knn.classes_)
print('\nConfusion Matrix')
print('-------------------')
print(cmDf1)
# Diagonal = correct predictions, off-diagonal = errors ("\033[1m" prints bold).
print("\033[1m The result is telling us that we have: ",(cm1[0,0]+cm1[1,1]),"correct predictions.")
print("\033[1m The result is telling us that we have: ",(cm1[0,1]+cm1[1,0]),"incorrect predictions.")
print("\033[1m We have a total predictions of: ",(cm1.sum()))
from sklearn.metrics import classification_report
print('\nclassification_report')
print('------------------------')
print(classification_report(y_test, y_pred1))
# Precision / recall / log-loss on the held-out test set.
per1=metrics.precision_score(y_test, y_pred1)
rec1=metrics.recall_score(y_test, y_pred1)
# NOTE(review): log_loss is fed hard 0/1 predictions rather than
# predict_proba output, which inflates the loss -- confirm this is intended.
log1=metrics.log_loss(y_test, y_pred1)
print("\033[1m Precision of the model:", "{:.2%}".format(per1))
print("\033[1m Recall of the model:", "{:.2%}".format(rec1))
print("\033[1m Log Loss of the model:", round(log1,3))
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# AUC computed from hard labels; the curve itself uses class-1 probabilities.
roc1 = roc_auc_score(y_test, y_pred1)
fpr, tpr, thresholds = roc_curve(y_test, knn.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='K-Nearest Neighbors (area = %0.4f)' % roc1)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
fs1=metrics.f1_score(y_test, y_pred1)
print("\033[1m The class 1 f1-score is:", "{:.4%}".format(fs1))
print("\033[1m The Model ROC AUC is:", "{:.4%}".format(roc1))
# ---------- Model 2: Logistic Regression ----------
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1, random_state=47)
# Fit on the SMOTE-balanced training split; score on the untouched test split.
logreg.fit(X_train, y_train)
print("Accuracy on training set:", "{:.2%}".format(logreg.score(X_train, y_train)))
print("Accuracy on test set:", "{:.2%}".format(logreg.score(X_test, y_test)))
print('\n accuracy:')
print('---------------')
acc2=logreg.score(X_test, y_test)
print(acc2)
y_pred2 = logreg.predict(X_test)
from sklearn.metrics import confusion_matrix
cm2 = confusion_matrix(y_true=y_test, y_pred=y_pred2)
cmDf2=pd.DataFrame(cm2, index=logreg.classes_, columns=logreg.classes_)
print('\nConfusion Matrix')
print('-------------------')
print(cmDf2)
# Diagonal = correct predictions, off-diagonal = errors ("\033[1m" prints bold).
print("\033[1m The result is telling us that we have: ",(cm2[0,0]+cm2[1,1]),"correct predictions.")
print("\033[1m The result is telling us that we have: ",(cm2[0,1]+cm2[1,0]),"incorrect predictions.")
print("\033[1m We have a total predictions of: ",(cm2.sum()))
from sklearn.metrics import classification_report
print('\nclassification_report')
print('------------------------')
print(classification_report(y_test, y_pred2))
# Precision / recall / log-loss on the held-out test set.
per2=metrics.precision_score(y_test, y_pred2)
rec2=metrics.recall_score(y_test, y_pred2)
# NOTE(review): log_loss is fed hard 0/1 predictions rather than
# predict_proba output, which inflates the loss -- confirm this is intended.
log2=metrics.log_loss(y_test, y_pred2)
print("\033[1m Precision of the model:", "{:.2%}".format(per2))
print("\033[1m Recall of the model:", "{:.2%}".format(rec2))
print("\033[1m Log Loss of the model:", round(log2,3))
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# AUC computed from hard labels; the curve itself uses class-1 probabilities.
roc2 = roc_auc_score(y_test, y_pred2)
fpr, tpr, thresholds = roc_curve(y_test, logreg.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.4f)' % roc2)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
fs2=metrics.f1_score(y_test, y_pred2)
print("\033[1m The class 1 f1-score is:", "{:.4%}".format(fs2))
print("\033[1m The Model ROC AUC is:", "{:.4%}".format(roc2))
# ---------- Model 3: Support Vector Machine ----------
from sklearn.svm import SVC
# probability=True is required for predict_proba (used by the ROC curve below);
# it makes fitting noticeably slower.
svc = SVC(probability=True, C=0.1, gamma=0.001, random_state=47)
# Fit on the SMOTE-balanced training split; score on the untouched test split.
svc.fit(X_train, y_train)
print("Accuracy on training set:", "{:.2%}".format(svc.score(X_train, y_train)))
print("Accuracy on test set:", "{:.2%}".format(svc.score(X_test, y_test)))
print('\n accuracy:')
print('---------------')
acc3=svc.score(X_test, y_test)
print(acc3)
y_pred3 = svc.predict(X_test)
from sklearn.metrics import confusion_matrix
cm3 = confusion_matrix(y_true=y_test, y_pred=y_pred3)
cmDf3=pd.DataFrame(cm3, index=svc.classes_, columns=svc.classes_)
print('\nConfusion Matrix')
print('-------------------')
print(cmDf3)
# Diagonal = correct predictions, off-diagonal = errors ("\033[1m" prints bold).
print("\033[1m The result is telling us that we have: ",(cm3[0,0]+cm3[1,1]),"correct predictions.")
print("\033[1m The result is telling us that we have: ",(cm3[0,1]+cm3[1,0]),"incorrect predictions.")
print("\033[1m We have a total predictions of: ",(cm3.sum()))
from sklearn.metrics import classification_report
print('\nclassification_report')
print('------------------------')
print(classification_report(y_test, y_pred3))
# Precision / recall / log-loss on the held-out test set.
per3=metrics.precision_score(y_test, y_pred3)
rec3=metrics.recall_score(y_test, y_pred3)
# NOTE(review): log_loss is fed hard 0/1 predictions rather than
# predict_proba output, which inflates the loss -- confirm this is intended.
log3=metrics.log_loss(y_test, y_pred3)
print("\033[1m Precision of the model:", "{:.2%}".format(per3))
print("\033[1m Recall of the model:", "{:.2%}".format(rec3))
print("\033[1m Log Loss of the model:", round(log3,3))
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# AUC computed from hard labels; the curve itself uses class-1 probabilities.
roc3 = roc_auc_score(y_test, y_pred3)
fpr, tpr, thresholds = roc_curve(y_test, svc.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Support Vector Machine (area = %0.4f)' % roc3)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
fs3=metrics.f1_score(y_test, y_pred3)
print("\033[1m The class 1 f1-score is:", "{:.4%}".format(fs3))
print("\033[1m The Model ROC AUC is:", "{:.4%}".format(roc3))
# --- Model 4: Decision Tree classifier ---
# max_depth=1 is a single decision stump (one split only).
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
dt = DecisionTreeClassifier(max_depth=1, random_state=47)
dt.fit(X_train, y_train)
print("Accuracy on training set:", "{:.2%}".format(dt.score(X_train, y_train)))
print("Accuracy on test set:", "{:.2%}".format(dt.score(X_test, y_test)))
print('\n accuracy:')
print('---------------')
acc4=dt.score(X_test, y_test)
print(acc4)
y_pred4 = dt.predict(X_test)
from sklearn.metrics import confusion_matrix
cm4 = confusion_matrix(y_true=y_test, y_pred=y_pred4)
cmDf4=pd.DataFrame(cm4, index=dt.classes_, columns=dt.classes_)
print('\nConfusion Matrix')
print('-------------------')
print(cmDf4)
print("\033[1m The result is telling us that we have: ",(cm4[0,0]+cm4[1,1]),"correct predictions.")
print("\033[1m The result is telling us that we have: ",(cm4[0,1]+cm4[1,0]),"incorrect predictions.")
print("\033[1m We have a total predictions of: ",(cm4.sum()))
from sklearn.metrics import classification_report
print('\nclassification_report')
print('------------------------')
print(classification_report(y_test, y_pred4))
per4=metrics.precision_score(y_test, y_pred4)
rec4=metrics.recall_score(y_test, y_pred4)
# NOTE(review): log_loss on hard labels is clipped/degenerate; consider
# dt.predict_proba(X_test) here.
log4=metrics.log_loss(y_test, y_pred4)
print("\033[1m Precision of the model:", "{:.2%}".format(per4))
print("\033[1m Recall of the model:", "{:.2%}".format(rec4))
print("\033[1m Log Loss of the model:", round(log4,3))
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
roc4 = roc_auc_score(y_test, y_pred4)
fpr, tpr, thresholds = roc_curve(y_test, dt.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Decision Trees (area = %0.4f)' % roc4)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
fs4=metrics.f1_score(y_test, y_pred4)
print("\033[1m The class 1 f1-score is:", "{:.4%}".format(fs4))
print("\033[1m The Model ROC AUC is:", "{:.4%}".format(roc4))
# --- Model 5: Random Forest classifier (shallow trees, depth 5) ---
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(max_depth=5, random_state=47)
rf.fit(X_train, y_train)
print("Accuracy on training set:", "{:.2%}".format(rf.score(X_train, y_train)))
print("Accuracy on test set:", "{:.2%}".format(rf.score(X_test, y_test)))
print('\n accuracy:')
print('---------------')
acc5=rf.score(X_test, y_test)
print(acc5)
y_pred5 = rf.predict(X_test)
from sklearn.metrics import confusion_matrix
cm5 = confusion_matrix(y_true=y_test, y_pred=y_pred5)
cmDf5=pd.DataFrame(cm5, index=rf.classes_, columns=rf.classes_)
print('\nConfusion Matrix')
print('-------------------')
print(cmDf5)
print("\033[1m The result is telling us that we have: ",(cm5[0,0]+cm5[1,1]),"correct predictions.")
print("\033[1m The result is telling us that we have: ",(cm5[0,1]+cm5[1,0]),"incorrect predictions.")
print("\033[1m We have a total predictions of: ",(cm5.sum()))
from sklearn.metrics import classification_report
print('\nclassification_report')
print('------------------------')
print(classification_report(y_test, y_pred5))
per5=metrics.precision_score(y_test, y_pred5)
rec5=metrics.recall_score(y_test, y_pred5)
# NOTE(review): log_loss on hard labels; predict_proba would be meaningful.
log5=metrics.log_loss(y_test, y_pred5)
print("\033[1m Precision of the model:", "{:.2%}".format(per5))
print("\033[1m Recall of the model:", "{:.2%}".format(rec5))
print("\033[1m Log Loss of the model:", round(log5,3))
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
roc5 = roc_auc_score(y_test, y_pred5)
fpr, tpr, thresholds = roc_curve(y_test, rf.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Random Forest (area = %0.4f)' % roc5)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
fs5=metrics.f1_score(y_test, y_pred5)
print("\033[1m The class 1 f1-score is:", "{:.4%}".format(fs5))
print("\033[1m The Model ROC AUC is:", "{:.4%}".format(roc5))
# --- Model 6: Gaussian Naïve Bayes classifier ---
import sklearn.naive_bayes as nb
nbc = nb.GaussianNB()
nbc.fit(X_train, y_train)
print("Accuracy on training set:", "{:.2%}".format(nbc.score(X_train, y_train)))
print("Accuracy on test set:", "{:.2%}".format(nbc.score(X_test, y_test)))
print('\n accuracy:')
print('---------------')
acc6=nbc.score(X_test, y_test)
print(acc6)
y_pred6 = nbc.predict(X_test)
from sklearn.metrics import confusion_matrix
cm6 = confusion_matrix(y_true=y_test, y_pred=y_pred6)
cmDf6=pd.DataFrame(cm6, index=nbc.classes_, columns=nbc.classes_)
print('\nConfusion Matrix')
print('-------------------')
print(cmDf6)
print("\033[1m The result is telling us that we have: ",(cm6[0,0]+cm6[1,1]),"correct predictions.")
print("\033[1m The result is telling us that we have: ",(cm6[0,1]+cm6[1,0]),"incorrect predictions.")
print("\033[1m We have a total predictions of: ",(cm6.sum()))
from sklearn.metrics import classification_report
print('\nclassification_report')
print('------------------------')
print(classification_report(y_test, y_pred6))
per6=metrics.precision_score(y_test, y_pred6)
rec6=metrics.recall_score(y_test, y_pred6)
# NOTE(review): log_loss on hard labels; predict_proba would be meaningful.
log6=metrics.log_loss(y_test, y_pred6)
print("\033[1m Precision of the model:", "{:.2%}".format(per6))
print("\033[1m Recall of the model:", "{:.2%}".format(rec6))
print("\033[1m Log Loss of the model:", round(log6,3))
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
roc6 = roc_auc_score(y_test, y_pred6)
fpr, tpr, thresholds = roc_curve(y_test, nbc.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Naïve Bayes (area = %0.4f)' % roc6)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
fs6=metrics.f1_score(y_test, y_pred6)
print("\033[1m The class 1 f1-score is:", "{:.4%}".format(fs6))
print("\033[1m The Model ROC AUC is:", "{:.4%}".format(roc6))
# --- Model 7: Gradient Boosting classifier ---
# NOTE(review): max_depth=26 is unusually deep for gradient boosting
# (typical values are 3-8) and likely overfits — compare the train vs.
# test accuracies printed below to confirm.
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier(max_depth=26, random_state=47)
gb.fit(X_train, y_train)
print("Accuracy on training set:", "{:.2%}".format(gb.score(X_train, y_train)))
print("Accuracy on test set:", "{:.2%}".format(gb.score(X_test, y_test)))
print('\n accuracy:')
print('---------------')
acc7=gb.score(X_test, y_test)
print(acc7)
y_pred7 = gb.predict(X_test)
from sklearn.metrics import confusion_matrix
cm7 = confusion_matrix(y_true=y_test, y_pred=y_pred7)
cmDf7=pd.DataFrame(cm7, index=gb.classes_, columns=gb.classes_)
print('\nConfusion Matrix')
print('-------------------')
print(cmDf7)
print("\033[1m The result is telling us that we have: ",(cm7[0,0]+cm7[1,1]),"correct predictions.")
print("\033[1m The result is telling us that we have: ",(cm7[0,1]+cm7[1,0]),"incorrect predictions.")
print("\033[1m We have a total predictions of: ",(cm7.sum()))
from sklearn.metrics import classification_report
print('\nclassification_report')
print('------------------------')
print(classification_report(y_test, y_pred7))
per7=metrics.precision_score(y_test, y_pred7)
rec7=metrics.recall_score(y_test, y_pred7)
log7=metrics.log_loss(y_test, y_pred7)
print("\033[1m Precision of the model:", "{:.2%}".format(per7))
print("\033[1m Recall of the model:", "{:.2%}".format(rec7))
print("\033[1m Log Loss of the model:", round(log7,3))
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
roc7 = roc_auc_score(y_test, y_pred7)
fpr, tpr, thresholds = roc_curve(y_test, gb.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Gradient Boosting (area = %0.4f)' % roc7)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
fs7=metrics.f1_score(y_test, y_pred7)
print("\033[1m The class 1 f1-score is:", "{:.4%}".format(fs7))
print("\033[1m The Model ROC AUC is:", "{:.4%}".format(roc7))
# --- Model 8: Extra Trees classifier (shallow trees, depth 3) ---
from sklearn.ensemble import ExtraTreesClassifier
et = ExtraTreesClassifier(max_depth=3, random_state=47)
et.fit(X_train, y_train)
print("Accuracy on training set:", "{:.2%}".format(et.score(X_train, y_train)))
print("Accuracy on test set:", "{:.2%}".format(et.score(X_test, y_test)))
print('\n accuracy:')
print('---------------')
acc8=et.score(X_test, y_test)
print(acc8)
y_pred8 = et.predict(X_test)
from sklearn.metrics import confusion_matrix
cm8 = confusion_matrix(y_true=y_test, y_pred=y_pred8)
cmDf8=pd.DataFrame(cm8, index=et.classes_, columns=et.classes_)
print('\nConfusion Matrix')
print('-------------------')
print(cmDf8)
print("\033[1m The result is telling us that we have: ",(cm8[0,0]+cm8[1,1]),"correct predictions.")
print("\033[1m The result is telling us that we have: ",(cm8[0,1]+cm8[1,0]),"incorrect predictions.")
print("\033[1m We have a total predictions of: ",(cm8.sum()))
from sklearn.metrics import classification_report
print('\nclassification_report')
print('------------------------')
print(classification_report(y_test, y_pred8))
per8=metrics.precision_score(y_test, y_pred8)
rec8=metrics.recall_score(y_test, y_pred8)
log8=metrics.log_loss(y_test, y_pred8)
print("\033[1m Precision of the model:", "{:.2%}".format(per8))
print("\033[1m Recall of the model:", "{:.2%}".format(rec8))
print("\033[1m Log Loss of the model:", round(log8,3))
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
roc8 = roc_auc_score(y_test, y_pred8)
fpr, tpr, thresholds = roc_curve(y_test, et.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Extra Trees (area = %0.4f)' % roc8)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
fs8=metrics.f1_score(y_test, y_pred8)
print("\033[1m The class 1 f1-score is:", "{:.4%}".format(fs8))
print("\033[1m The Model ROC AUC is:", "{:.4%}".format(roc8))
import re
# XGBoost rejects feature names containing '[', ']' or '<'; replace any
# such character with '_' in both train and test column labels so the
# XGBClassifier below can fit on these frames.
# (re.IGNORECASE is a no-op here — the pattern contains no letters.)
regex = re.compile(r"\[|\]|<", re.IGNORECASE)
X_train.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in X_train.columns.values]
X_test.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in X_test.columns.values]
# --- Model 9: XGBoost classifier ---
# NOTE(review): max_depth=14 is deep for boosted trees and may overfit;
# compare the train vs. test accuracy printed below.
from xgboost import XGBClassifier
xg = XGBClassifier(max_depth=14, random_state=47)
xg.fit(X_train, y_train)
print("Accuracy on training set:", "{:.2%}".format(xg.score(X_train, y_train)))
print("Accuracy on test set:", "{:.2%}".format(xg.score(X_test, y_test)))
print('\n accuracy:')
print('---------------')
acc9=xg.score(X_test, y_test)
print(acc9)
y_pred9 = xg.predict(X_test)
from sklearn.metrics import confusion_matrix
cm9 = confusion_matrix(y_true=y_test, y_pred=y_pred9)
cmDf9=pd.DataFrame(cm9, index=xg.classes_, columns=xg.classes_)
print('\nConfusion Matrix')
print('-------------------')
print(cmDf9)
print("\033[1m The result is telling us that we have: ",(cm9[0,0]+cm9[1,1]),"correct predictions.")
print("\033[1m The result is telling us that we have: ",(cm9[0,1]+cm9[1,0]),"incorrect predictions.")
print("\033[1m We have a total predictions of: ",(cm9.sum()))
from sklearn.metrics import classification_report
print('\nclassification_report')
print('------------------------')
print(classification_report(y_test, y_pred9))
per9=metrics.precision_score(y_test, y_pred9)
rec9=metrics.recall_score(y_test, y_pred9)
log9=metrics.log_loss(y_test, y_pred9)
print("\033[1m Precision of the model:", "{:.2%}".format(per9))
print("\033[1m Recall of the model:", "{:.2%}".format(rec9))
print("\033[1m Log Loss of the model:", round(log9,3))
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
roc9 = roc_auc_score(y_test, y_pred9)
fpr, tpr, thresholds = roc_curve(y_test, xg.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='XGBoost (area = %0.4f)' % roc9)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
fs9=metrics.f1_score(y_test, y_pred9)
print("\033[1m The class 1 f1-score is:", "{:.4%}".format(fs9))
print("\033[1m The Model ROC AUC is:", "{:.4%}".format(roc9))
# --- Model 10: AdaBoost classifier ---
# NOTE(review): n_estimators=1 means a single boosted stump — no actual
# boosting takes place; presumably tuned elsewhere, verify intent.
from sklearn.ensemble import AdaBoostClassifier
ad = AdaBoostClassifier(n_estimators=1, random_state=47)
ad.fit(X_train, y_train)
print("Accuracy on training set:", "{:.2%}".format(ad.score(X_train, y_train)))
print("Accuracy on test set:", "{:.2%}".format(ad.score(X_test, y_test)))
print('\n accuracy:')
print('---------------')
acc10=ad.score(X_test, y_test)
print(acc10)
y_pred10 = ad.predict(X_test)
from sklearn.metrics import confusion_matrix
cm10 = confusion_matrix(y_true=y_test, y_pred=y_pred10)
cmDf10=pd.DataFrame(cm10, index=ad.classes_, columns=ad.classes_)
print('\nConfusion Matrix')
print('-------------------')
print(cmDf10)
print("\033[1m The result is telling us that we have: ",(cm10[0,0]+cm10[1,1]),"correct predictions.")
print("\033[1m The result is telling us that we have: ",(cm10[0,1]+cm10[1,0]),"incorrect predictions.")
print("\033[1m We have a total predictions of: ",(cm10.sum()))
from sklearn.metrics import classification_report
print('\nclassification_report')
print('------------------------')
print(classification_report(y_test, y_pred10))
per10=metrics.precision_score(y_test, y_pred10)
rec10=metrics.recall_score(y_test, y_pred10)
log10=metrics.log_loss(y_test, y_pred10)
print("\033[1m Precision of the model:", "{:.2%}".format(per10))
print("\033[1m Recall of the model:", "{:.2%}".format(rec10))
print("\033[1m Log Loss of the model:", round(log10,3))
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
roc10 = roc_auc_score(y_test, y_pred10)
fpr, tpr, thresholds = roc_curve(y_test, ad.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='AdaBoost (area = %0.4f)' % roc10)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
fs10=metrics.f1_score(y_test, y_pred10)
print("\033[1m The class 1 f1-score is:", "{:.4%}".format(fs10))
print("\033[1m The Model ROC AUC is:", "{:.4%}".format(roc10))
# --- Compare all ten classifiers by class-1 f1-score and test accuracy ---
models = ['K-Nearest Neighbors','Logistic Regression',
'Support Vector Machine Classifier', 'Decision Tree Classifier',
'Random Forest Classifier', 'Naïve Bayes Classifier',
'Gradient Boosting Classifier','Extra Trees Classifier',
'XGBoost Classifier','AdaBoost Classifier']
tests_f1score = [fs1, fs2, fs3, fs4, fs5, fs6, fs7, fs8, fs9, fs10]
tests_acc = [acc1, acc2, acc3, acc4, acc5, acc6, acc7, acc8, acc9, acc10]
compare_models = pd.DataFrame({ "Algorithms": models, "Tests f1-score": tests_f1score, "Tests Accuracy": tests_acc})
# NOTE(review): sort_values returns a new frame; here the sorted result is
# only the notebook cell's display output — compare_models stays unsorted.
compare_models.sort_values(by = "Tests f1-score", ascending = False)
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(figsize=(8,8))
sns.barplot(x = "Tests f1-score", y = "Algorithms", data = compare_models)
plt.show()
# Feature labels: every column of `cols` except index 30 (the dropped
# column — presumably the target; TODO confirm against X_train.columns).
default_features = [x for i,x in enumerate(cols) if i!=30]
def plot_feature_importances_default(model):
    """Plot a horizontal bar chart of a fitted model's feature importances.

    BUG FIX: the original sized the axes with n_features = len(cols), while
    default_features has one fewer entry (index 30 is excluded), so
    plt.yticks received mismatched tick/label counts.  Size everything from
    the model's own feature_importances_ instead, which is always correct
    for the model actually passed in.
    """
    plt.figure(figsize=(10,10))
    n_features = len(model.feature_importances_)
    plt.barh(range(n_features), model.feature_importances_, align='center')
    # Label one tick per importance; slice defensively in case the label
    # list is longer than the model's feature count.
    plt.yticks(np.arange(n_features), default_features[:n_features])
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")
    plt.ylim(-1, n_features)
plot_feature_importances_default(et)
plt.savefig('feature_importance')
# Re-create the three tree-based models (same hyper-parameters as the
# individual-model sections above) to use as base estimators for the
# ensembling experiments that follow.
ET_clf = ExtraTreesClassifier(max_depth=3, random_state=47)
RF_clf = RandomForestClassifier(max_depth=5, random_state=47)
DT_clf = DecisionTreeClassifier(max_depth=1, random_state=47)
ET_clf.fit(X_train, y_train)
RF_clf.fit(X_train, y_train)
DT_clf.fit(X_train, y_train)
# Hard 0/1 predictions from each base model on the test set.
ET_pred = ET_clf.predict(X_test)
RF_pred = RF_clf.predict(X_test)
DT_pred = DT_clf.predict(X_test)
from sklearn.metrics import accuracy_score, f1_score, log_loss
# --- Ensemble 1: simple averaging (majority vote) of ET, RF and DT ---
# BUG FIX: the original computed (ET_pred + RF_pred + DT_pred)//3.  With
# 0/1 predictions, integer floor division yields 1 only when ALL THREE
# models vote 1 (sum == 3), i.e. unanimity, not a majority.  A majority
# vote needs just 2 of the 3 votes.  (The duplicate recomputation of
# averaged_preds that followed the accuracy print was also removed.)
averaged_preds = ((ET_pred + RF_pred + DT_pred) >= 2).astype(int)
print('\n accuracy:')
print('---------------')
acc11=accuracy_score(y_test,averaged_preds)
print(acc11)
from sklearn.metrics import confusion_matrix
cm11 = confusion_matrix(y_true=y_test, y_pred=averaged_preds)
cmDf11=pd.DataFrame(cm11, index=DT_clf.classes_, columns=DT_clf.classes_)
print('\nConfusion Matrix')
print('-------------------')
print(cmDf11)
print("\033[1m The result is telling us that we have: ",(cm11[0,0]+cm11[1,1]),"correct predictions.")
print("\033[1m The result is telling us that we have: ",(cm11[0,1]+cm11[1,0]),"incorrect predictions.")
print("\033[1m We have a total predictions of: ",(cm11.sum()))
from sklearn.metrics import classification_report
print('\nclassification_report')
print('------------------------')
print(classification_report(y_test, averaged_preds))
per11=metrics.precision_score(y_test, averaged_preds)
rec11=metrics.recall_score(y_test, averaged_preds)
# NOTE(review): log_loss on hard 0/1 labels is clipped/degenerate; the
# averaged probabilities computed below would give a meaningful log loss.
log11=metrics.log_loss(y_test, averaged_preds)
print("\033[1m Precision of the model:", "{:.2%}".format(per11))
print("\033[1m Recall of the model:", "{:.2%}".format(rec11))
print("\033[1m Log Loss of the model:", round(log11,3))
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
roc11 = roc_auc_score(y_test, averaged_preds)
# BUG FIX: the averaged probability also used floor division (`//3`),
# which collapses the mean of three values in [0, 1] to 0 for virtually
# every sample and flattens the ROC curve; true division keeps the mean
# probability in [0, 1] as intended.
average_proba = (ET_clf.predict_proba(X_test)[:,1]+ RF_clf.predict_proba(X_test)[:,1]+ DT_clf.predict_proba(X_test)[:,1])/3
fpr, tpr, thresholds = roc_curve(y_test, average_proba)
plt.figure()
plt.plot(fpr, tpr, label='Simple Averaging Approach (area = %0.4f)' % roc11)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
fs11=metrics.f1_score(y_test, averaged_preds)
print("\033[1m The class 1 f1-score is:", "{:.4%}".format(fs11))
print("\033[1m The Model ROC AUC is:", "{:.4%}".format(roc11))
# --- Ensemble 2: soft-voting classifier over ET, RF and DT ---
# voting='soft' averages the base models' predicted probabilities.
voting_clf = VotingClassifier(estimators=[('ET', ET_clf), ('RF', RF_clf), ('DT', DT_clf)], voting='soft')
voting_clf.fit(X_train, y_train)
print("Accuracy on training set:", "{:.2%}".format(voting_clf.score(X_train, y_train)))
print("Accuracy on test set:", "{:.2%}".format(voting_clf.score(X_test, y_test)))
print('\n accuracy:')
print('---------------')
acc12=voting_clf.score(X_test, y_test)
print(acc12)
voting_preds = voting_clf.predict(X_test)
from sklearn.metrics import confusion_matrix
cm12 = confusion_matrix(y_true=y_test, y_pred=voting_preds)
cmDf12=pd.DataFrame(cm12, index=voting_clf.classes_, columns=voting_clf.classes_)
print('\nConfusion Matrix')
print('-------------------')
print(cmDf12)
# CONSISTENCY FIX: every sibling section prints the correct/incorrect/
# total breakdown after the confusion matrix; this one was missing it.
print("\033[1m The result is telling us that we have: ",(cm12[0,0]+cm12[1,1]),"correct predictions.")
print("\033[1m The result is telling us that we have: ",(cm12[0,1]+cm12[1,0]),"incorrect predictions.")
print("\033[1m We have a total predictions of: ",(cm12.sum()))
from sklearn.metrics import classification_report
print('\nclassification_report')
print('------------------------')
print(classification_report(y_test, voting_preds))
per12=metrics.precision_score(y_test, voting_preds)
rec12=metrics.recall_score(y_test, voting_preds)
log12=metrics.log_loss(y_test, voting_preds)
print("\033[1m Precision of the model:", "{:.2%}".format(per12))
print("\033[1m Recall of the model:", "{:.2%}".format(rec12))
print("\033[1m Log Loss of the model:", round(log12,3))
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
roc12 = roc_auc_score(y_test, voting_preds)
fpr, tpr, thresholds = roc_curve(y_test, voting_clf.predict_proba(X_test)[:,1])
plt.figure()
# BUG FIX: the label used 'Voting\Stacking' — '\S' is an invalid escape
# sequence (DeprecationWarning, and the legend rendered a stray
# backslash); use a forward slash.
plt.plot(fpr, tpr, label='Voting/Stacking Classification (area = %0.4f)' % roc12)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
fs12=metrics.f1_score(y_test, voting_preds)
print("\033[1m The class 1 f1-score is:", "{:.4%}".format(fs12))
print("\033[1m The Model ROC AUC is:", "{:.4%}".format(roc12))
# --- Ensemble 3: bagged versions of the three base models, combined by
# soft voting ---
# NOTE(review): `base_estimator` was renamed to `estimator` in
# scikit-learn 1.2 and removed in 1.4 — update if the environment is
# upgraded.  Bagging an already-bagged RandomForest/ExtraTrees is also
# computationally heavy (50 x n_estimators trees each).
from sklearn.ensemble import BaggingClassifier
DT_Bagg = BaggingClassifier(base_estimator=DT_clf, n_estimators=50, random_state=47)
RF_Bagg = BaggingClassifier(base_estimator=RF_clf, n_estimators=50, random_state=47)
ET_Bagg = BaggingClassifier(base_estimator=ET_clf, n_estimators=50, random_state=47)
DT_Bagg.fit(X_train, y_train)
RF_Bagg.fit(X_train, y_train)
ET_Bagg.fit(X_train, y_train)
voting_Bagg = VotingClassifier(estimators=[('ET', ET_Bagg), ('RF', RF_Bagg), ('DT', DT_Bagg)], voting='soft')
voting_Bagg.fit(X_train, y_train)
print("Accuracy on training set:", "{:.2%}".format(voting_Bagg.score(X_train, y_train)))
print("Accuracy on test set:", "{:.2%}".format(voting_Bagg.score(X_test, y_test)))
print('\n accuracy:')
print('---------------')
acc13=voting_Bagg.score(X_test, y_test)
print(acc13)
voting_preds1 = voting_Bagg.predict(X_test)
from sklearn.metrics import confusion_matrix
cm13 = confusion_matrix(y_true=y_test, y_pred=voting_preds1)
cmDf13=pd.DataFrame(cm13, index=voting_Bagg.classes_, columns=voting_Bagg.classes_)
print('\nConfusion Matrix')
print('-------------------')
print(cmDf13)
print("\033[1m The result is telling us that we have: ",(cm13[0,0]+cm13[1,1]),"correct predictions.")
print("\033[1m The result is telling us that we have: ",(cm13[0,1]+cm13[1,0]),"incorrect predictions.")
print("\033[1m We have a total predictions of: ",(cm13.sum()))
from sklearn.metrics import classification_report
print('\nclassification_report')
print('------------------------')
print(classification_report(y_test, voting_preds1))
per13=metrics.precision_score(y_test, voting_preds1)
rec13=metrics.recall_score(y_test, voting_preds1)
log13=metrics.log_loss(y_test, voting_preds1)
print("\033[1m Precision of the model:", "{:.2%}".format(per13))
print("\033[1m Recall of the model:", "{:.2%}".format(rec13))
print("\033[1m Log Loss of the model:", round(log13,3))
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
roc13 = roc_auc_score(y_test, voting_preds1)
fpr, tpr, thresholds = roc_curve(y_test, voting_Bagg.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Bagging Classification (area = %0.4f)' % roc13)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
fs13=metrics.f1_score(y_test, voting_preds1)
print("\033[1m The class 1 f1-score is:", "{:.4%}".format(fs13))
print("\033[1m The Model ROC AUC is:", "{:.4%}".format(roc13))
# --- Ensemble 4: the three boosting models combined by soft voting ---
# The column-name sanitising below repeats the earlier cell verbatim
# (idempotent) so this cell can run standalone before XGBClassifier.fit.
import re
regex = re.compile(r"\[|\]|<", re.IGNORECASE)
X_train.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in X_train.columns.values]
X_test.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in X_test.columns.values]
# NOTE(review): BaggingClassifier is imported here but never used in this
# cell — leftover from the previous section.
from sklearn.ensemble import BaggingClassifier
Ada_Boost = AdaBoostClassifier(n_estimators=1, random_state=47)
XG_Boost = XGBClassifier(max_depth=14, random_state=47)
Grad_Boost = GradientBoostingClassifier(max_depth=26, random_state=47)
Ada_Boost.fit(X_train, y_train)
XG_Boost.fit(X_train, y_train)
Grad_Boost.fit(X_train, y_train)
voting_Boost = VotingClassifier(estimators=[('Grad', Grad_Boost), ('XG', XG_Boost), ('Ada', Ada_Boost)], voting='soft')
voting_Boost.fit(X_train, y_train)
print("Accuracy on training set:", "{:.2%}".format(voting_Boost.score(X_train, y_train)))
print("Accuracy on test set:", "{:.2%}".format(voting_Boost.score(X_test, y_test)))
print('\n accuracy:')
print('---------------')
acc14=voting_Boost.score(X_test, y_test)
print(acc14)
voting_preds2 = voting_Boost.predict(X_test)
from sklearn.metrics import confusion_matrix
cm14 = confusion_matrix(y_test, voting_preds2)
cmDf14=pd.DataFrame(cm14, index=voting_Boost.classes_, columns=voting_Boost.classes_)
print('\nConfusion Matrix')
print('-------------------')
print(cmDf14)
print("\033[1m The result is telling us that we have: ",(cm14[0,0]+cm14[1,1]),"correct predictions.")
print("\033[1m The result is telling us that we have: ",(cm14[0,1]+cm14[1,0]),"incorrect predictions.")
print("\033[1m We have a total predictions of: ",(cm14.sum()))
from sklearn.metrics import classification_report
print('\nclassification_report')
print('------------------------')
print(classification_report(y_test, voting_preds2))
per14=metrics.precision_score(y_test, voting_preds2)
rec14=metrics.recall_score(y_test, voting_preds2)
log14=metrics.log_loss(y_test, voting_preds2)
print("\033[1m Precision of the model:", "{:.2%}".format(per14))
print("\033[1m Recall of the model:", "{:.2%}".format(rec14))
print("\033[1m Log Loss of the model:", round(log14,3))
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
roc14 = roc_auc_score(y_test, voting_preds2)
fpr, tpr, thresholds = roc_curve(y_test, voting_Boost.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Boosting Classification (area = %0.4f)' % roc14)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
fs14=metrics.f1_score(y_test, voting_preds2)
print("\033[1m The class 1 f1-score is:", "{:.4%}".format(fs14))
print("\033[1m The Model ROC AUC is:", "{:.4%}".format(roc14))
# --- Compare the four ensembling strategies by f1-score and accuracy ---
methods = ['Simple Averaging Approach','Voting/Stacking Classification',
'Bagging Classification','Boosting Classification']
tests_f1score = [fs11, fs12, fs13, fs14]
tests_acc = [acc11, acc12, acc13, acc14]
compare_models = pd.DataFrame({ "Methods": methods, "Tests f1-score": tests_f1score, "Tests Accuracy": tests_acc})
# NOTE(review): the sorted frame is only displayed as cell output;
# compare_models itself remains unsorted.
compare_models.sort_values(by = "Tests f1-score", ascending = False)
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(figsize=(8,8))
sns.barplot(x = "Tests f1-score", y = "Methods", data = compare_models)
plt.show()
# --- Model 15: Multi-Layer Perceptron (default architecture) ---
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(random_state=47)
mlp.fit(X_train, y_train)
# NOTE(review): these two prints use "{:.2f}" (raw fraction) unlike every
# other section's "{:.2%}" — inconsistent display, same underlying value.
print("Accuracy on training set: {:.2f}".format(mlp.score(X_train, y_train)))
print("Accuracy on test set: {:.2f}".format(mlp.score(X_test, y_test)))
print('\n accuracy:')
print('---------------')
acc15=mlp.score(X_test, y_test)
print(acc15)
y_pred15 = mlp.predict(X_test)
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm15 = confusion_matrix(y_test, y_pred15)
cmDf15=pd.DataFrame(cm15, index=mlp.classes_, columns=mlp.classes_)
print('\nConfusion Matrix')
print('-------------------')
print(cmDf15)
print("\033[1m The result is telling us that we have: ",(cm15[0,0]+cm15[1,1]),"correct predictions.")
print("\033[1m The result is telling us that we have: ",(cm15[0,1]+cm15[1,0]),"incorrect predictions.")
print("\033[1m We have a total predictions of: ",(cm15.sum()))
from sklearn.metrics import classification_report
print('\nclassification_report')
print('------------------------')
print(classification_report(y_test, y_pred15))
per15=metrics.precision_score(y_test, y_pred15)
rec15=metrics.recall_score(y_test, y_pred15)
log15=metrics.log_loss(y_test, y_pred15)
print("\033[1m Precision of the model:", "{:.2%}".format(per15))
print("\033[1m Recall of the model:", "{:.2%}".format(rec15))
print("\033[1m Log Loss of the model:", round(log15,3))
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
roc15 = roc_auc_score(y_test, y_pred15)
fpr, tpr, thresholds = roc_curve(y_test, mlp.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Multi-layer Perceptron (area = %0.4f)' % roc15)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
fs15=metrics.f1_score(y_test, y_pred15)
print("\033[1m The class 1 f1-score is:", "{:.4%}".format(fs15))
print("\033[1m The Model ROC AUC is:", "{:.4%}".format(roc15))
# --- Short-list the three best performers and compare them ---
models = ['Extra Trees Classifier','Simple Averaging Approach',
'Multi-Layer Perceptron']
tests_f1score = [fs8, fs11, fs15]
tests_acc = [acc8, acc11, acc15]
compare_models = pd.DataFrame({ "Algorithms": models, "Tests f1-score": tests_f1score, "Tests Accuracy": tests_acc})
compare_models.sort_values(by = "Tests f1-score", ascending = False)
# NOTE(review): this re-creates and refits the exact same MLP as the
# previous section (same random_state, same data) — redundant work.
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(random_state=47)
mlp.fit(X_train, y_train)
print("\033[1m Accuracy of Multi-Layer Perceptron classifier on test set:", "{:.4f}".format(mlp.score(X_test, y_test)))
Cross-validation attempts to avoid overfitting while still producing a prediction for each observation in the dataset. We are using 10-fold cross-validation to train our model.
# --- 10-fold cross-validation of the chosen MLP model ---
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
# BUG FIX: KFold only accepts random_state when shuffle=True.  In
# scikit-learn >= 0.24 the original call (random_state without shuffle)
# raises ValueError; in older versions the random_state was silently
# ignored.  Shuffling also avoids fold bias if the rows are ordered.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=47)
modelCV = mlp
scoring = 'accuracy'
results = model_selection.cross_val_score(modelCV, X_train, y_train, cv=kfold, scoring=scoring)
print("\033[1m 10-fold cross validation average accuracy:", "{:.4f}".format((results.mean())))
y_pred15 = mlp.predict(X_test)
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm15 = confusion_matrix(y_test, y_pred15)
cmDf15=pd.DataFrame(cm15, index=mlp.classes_, columns=mlp.classes_)
print('\nConfusion Matrix')
print('-------------------')
print(cmDf15)
print('-------------------')
print("\033[1m The result is telling us that we have: ",(cm15[0,0]+cm15[1,1]),"correct predictions.")
print("\033[1m The result is telling us that we have: ",(cm15[0,1]+cm15[1,0]),"incorrect predictions.")
print("\033[1m We have a total predictions of: ",(cm15.sum()))
from sklearn.metrics import classification_report
print('\nclassification_report')
print('------------------------')
print(classification_report(y_test, y_pred15))
To quote from Scikit Learn:
The precision is the ratio tp / (tp + fp) where tp is the number of true positives and fp the number of false positives. The precision is intuitively the ability of the classifier to not label a sample as positive if it is negative.
The recall is the ratio tp / (tp + fn) where tp is the number of true positives and fn the number of false negatives. The recall is intuitively the ability of the classifier to find all the positive samples.
The F-beta score can be interpreted as a weighted harmonic mean of the precision and recall, where an F-beta score reaches its best value at 1 and worst score at 0.
The F-beta score weights the recall more than the precision by a factor of beta. beta = 1.0 means recall and precision are equally important.
The support is the number of occurrences of each class in y_test.
# Compute and format each metric once, then report it with commentary.
acc_pct = "{:.2%}".format(metrics.accuracy_score(y_test, y_pred15))
prec_pct = "{:.2%}".format(metrics.precision_score(y_test, y_pred15))
rec_pct = "{:.2%}".format(metrics.recall_score(y_test, y_pred15))

# Accuracy: overall fraction of correct classifications.
print("\nAccuracy of MLP Classifier:", acc_pct)
print("Accuracy: Well, we got a classification rate of", acc_pct)

# Precision: of the loans predicted to default, how many actually did.
print("\nPrecision of MLP Classifier:", prec_pct)
print("Precision: Precision is about being precise, i.e., how precise our model is. In other words, we can say, when a model makes a prediction, how often it is correct. In our prediction case, when our model predict a loan is about to default, that loan actually defaulted", prec_pct ,"of the time.")

# Recall: of the loans that actually defaulted, how many we caught.
print("\nRecall of MLP Classifier:", rec_pct)
print("Recall: If there is a loan that defaulted present in the test set, our model can identify it", rec_pct ,"of the time.")
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Predicted probability of the positive class (default = 1), computed once and
# reused for both the AUC and the curve.
# BUG FIX: the original passed hard labels from mlp.predict() to roc_auc_score,
# which collapses the ROC analysis to a single operating point and understates
# the area; AUC must be computed from the same continuous scores the curve uses.
y_score15 = mlp.predict_proba(X_test)[:, 1]
mlp_roc_auc = roc_auc_score(y_test, y_score15)
fpr, tpr, thresholds = roc_curve(y_test, y_score15)

plt.figure()
plt.plot(fpr, tpr, label= 'Multi-Layer Perceptron (area = %0.4f)' % mlp_roc_auc)
plt.plot([0, 1], [0, 1],'r--')   # chance diagonal for reference
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')   # save before show(), otherwise the saved figure is blank
plt.show()
Finally, we plot a heat map of the first layer weights in a neural network learned on the default data set.
# Feature names fed to the network: every column of `cols` except index 30 —
# presumably the DEFAULT target column; TODO confirm against where cols is built.
default_features = [x for i, x in enumerate(cols) if i != 30]
plt.figure(figsize=(20, 5))
# First-layer weight matrix: one row per input feature, one column per hidden unit.
plt.imshow(mlp.coefs_[0], interpolation='none', cmap='viridis')
# BUG FIX: the original used plt.yticks(range(8), default_features) — 8 tick
# positions for ~30 labels (copied from an 8-feature tutorial), which mismatches
# and raises in matplotlib. One tick per feature label instead.
plt.yticks(range(len(default_features)), default_features)
plt.xlabel("Columns in weight matrix")
plt.ylabel("Input feature")
plt.colorbar()
From the heat map, it is not easy to quickly point out which feature (or features) have relatively low weights compared to the other features.
Now that our model has been built, let's use it for real-time predictions.
# Reuse the full dummy-encoded feature list as the columns fed to the model.
chosenModel = cols
chosenModel
# P(default) for every loan: probability of class 1 from the trained MLP.
# NOTE(review): this assumes df_loan_dummies[chosenModel] matches the scale and
# column order of the matrix the MLP was trained on — confirm upstream.
df_loan_dummies['Probability_to_Default'] = mlp.predict_proba(df_loan_dummies[chosenModel])[:,1]
# Carry the loan identifier over from the original (pre-dummy) frame.
df_loan_dummies["LOAN_ID"] = df['LOAN_ID']
# Actual label and hard prediction side by side for eyeballing agreement.
df_loan_dummies["TRUE"]=df_loan_dummies["DEFAULT"]
df_loan_dummies["PREDICTED"]=mlp.predict(X)
df_loan_dummies[["LOAN_ID","TRUE","PREDICTED", "Probability_to_Default"]].head(10)
# Persist the scored table for downstream consumption.
df_loan_dummies.to_csv('Prob_to_Default.csv', index=False, encoding='utf-8')
# --- Interactive scoring: collect one applicant's features from stdin ---
# The answers must follow the one-hot layout of the training columns: exactly
# one term flag (a_6/a_7), one purpose flag (a_8..a_21), one employment-length
# flag (a_22..a_28) and one home flag (a_29/a_30) should be 1 — presumably
# matching the dummy columns; TODO confirm against the training column order.
# (Prompt wording fixed: the original prompts contained broken English such as
# "Does the loan term is ..." and "in in percentage".)
a_1 = int(input("Please enter the loan amount without comma separators (for example, 20000):"))
a_2 = float(input("Please enter the loan rate in percentage (for example, 17.93):"))
a_3 = float(input("Please enter the monthly payment (for example, 342.94):"))
a_4 = float(input("Please enter the annual income without comma separators (for example, 344304):"))
a_5 = float(input("Please enter the debt to income ratio in percentage (for example, 18.47):"))
a_6 = int(input("Is the loan term 36 months (1 if yes, 0 otherwise)? (for example, 0):"))
a_7 = int(input("Is the loan term 60 months (1 if yes, 0 otherwise)? (for example, 1):"))
a_8 = int(input("Is the loan purpose a car (1 if yes, 0 otherwise)? (for example, 0):"))
a_9 = int(input("Is the loan purpose a credit card (1 if yes, 0 otherwise)? (for example, 0):"))
a_10 = int(input("Is the loan purpose a debt consolidation (1 if yes, 0 otherwise)? (for example, 1):"))
a_11 = int(input("Is the loan purpose an education (1 if yes, 0 otherwise)? (for example, 0):"))
a_12 = int(input("Is the loan purpose a home improvement (1 if yes, 0 otherwise)? (for example, 0):"))
a_13 = int(input("Is the loan purpose a house (1 if yes, 0 otherwise)? (for example, 0):"))
a_14 = int(input("Is the loan purpose a major purchase (1 if yes, 0 otherwise)? (for example, 0):"))
a_15 = int(input("Is the loan purpose a medical treatment (1 if yes, 0 otherwise)? (for example, 0):"))
a_16 = int(input("Is the loan purpose moving (1 if yes, 0 otherwise)? (for example, 0):"))
a_17 = int(input("Is the loan purpose some other purpose (1 if yes, 0 otherwise)? (for example, 0):"))
a_18 = int(input("Is the loan purpose renewable_energy (1 if yes, 0 otherwise)? (for example, 0):"))
a_19 = int(input("Is the loan purpose a small_business (1 if yes, 0 otherwise)? (for example, 0):"))
a_20 = int(input("Is the loan purpose a vacation (1 if yes, 0 otherwise)? (for example, 0):"))
a_21 = int(input("Is the loan purpose a wedding (1 if yes, 0 otherwise)? (for example, 0):"))
a_22 = int(input("Is the loan applicant's employment length between 1 and 2 years (1 if yes, 0 otherwise)? (for example, 1):"))
a_23 = int(input("Is the loan applicant's employment length between 3 and 4 years (1 if yes, 0 otherwise)? (for example, 0):"))
a_24 = int(input("Is the loan applicant's employment length between 5 and 6 years (1 if yes, 0 otherwise)? (for example, 0):"))
a_25 = int(input("Is the loan applicant's employment length between 7 and 8 years (1 if yes, 0 otherwise)? (for example, 0):"))
a_26 = int(input("Is the loan applicant's employment length between 9 and 10 years (1 if yes, 0 otherwise)? (for example, 0):"))
a_27 = int(input("Is the loan applicant's employment length lower than 1 year (1 if yes, 0 otherwise)? (for example, 0):"))
a_28 = int(input("Is the loan applicant's employment length higher than 10 years (1 if yes, 0 otherwise)? (for example, 0):"))
a_29 = int(input("Does the loan applicant rent a home (1 if yes, 0 otherwise)? (for example, 0):"))
a_30 = int(input("Does the loan applicant own a home (1 if yes, 0 otherwise)? (for example, 1):"))
# Assemble the answers into a single-row feature matrix, in the same column
# order the model was trained on.
new_data = np.array([a_1,a_2,a_3,a_4,a_5,a_6,a_7,a_8,a_9,a_10,a_11,a_12,a_13,a_14,a_15,
                     a_16,a_17,a_18,a_19,a_20,a_21,a_22,a_23,a_24,a_25,a_26,a_27,a_28,
                     a_29,a_30]).reshape(1,-1)
new_pred = mlp.predict(new_data)
new_prob = mlp.predict_proba(new_data)
int(new_pred[0])  # notebook display of the hard prediction (0 = good, 1 = default)
# P(default) is the probability of class 1, i.e. column 1 of predict_proba.
# The original reported max()/min() of the probability row, which only equals
# this because predict() is the argmax — read the class-1 column directly.
default_probability = new_prob[0][1]
if int(new_pred[0]) == 1:
    print("\033[1m \nThe new loan is predicted to default (Don't give this applicant any money!!!!)\033[1m")
else:
    print("\033[1m \nThe new loan is predicted not to default (Feel free to give this applicant a loan)\033[1m")
print("\033[1m \nThe default probability of this applicant is", "{:.4%}".format(default_probability))